import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Load the US comments export. Rows with the wrong number of fields are
# skipped with a warning instead of aborting the load. `on_bad_lines='warn'`
# replaces the deprecated `error_bad_lines=False` flag.
comments = pd.read_csv(r'C:\DA_BA_material/UScomments.csv', on_bad_lines='warn')
C:\Users\Benny\AppData\Local\Temp\ipykernel_10884\2660278793.py:1: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future. comments = pd.read_csv(r'C:\DA_BA_material/UScomments.csv' , error_bad_lines=False) Skipping line 41589: expected 4 fields, saw 11 Skipping line 51628: expected 4 fields, saw 7 Skipping line 114465: expected 4 fields, saw 5 Skipping line 142496: expected 4 fields, saw 8 Skipping line 189732: expected 4 fields, saw 6 Skipping line 245218: expected 4 fields, saw 7 Skipping line 388430: expected 4 fields, saw 5 C:\Users\Benny\AppData\Local\Temp\ipykernel_10884\2660278793.py:1: DtypeWarning: Columns (2,3) have mixed types. Specify dtype option on import or set low_memory=False. comments = pd.read_csv(r'C:\DA_BA_material/UScomments.csv' , error_bad_lines=False)
comments.head()
| video_id | comment_text | likes | replies | |
|---|---|---|---|---|
| 0 | XpVt6Z1Gjjo | Logan Paul it's yo big day ‼️‼️‼️ | 4 | 0 |
| 1 | XpVt6Z1Gjjo | I've been following you from the start of your... | 3 | 0 |
| 2 | XpVt6Z1Gjjo | Say hi to Kong and maverick for me | 3 | 0 |
| 3 | XpVt6Z1Gjjo | MY FAN . attendance | 3 | 0 |
| 4 | XpVt6Z1Gjjo | trending 😉 | 3 | 0 |
comments.isnull()
| video_id | comment_text | likes | replies | |
|---|---|---|---|---|
| 0 | False | False | False | False |
| 1 | False | False | False | False |
| 2 | False | False | False | False |
| 3 | False | False | False | False |
| 4 | False | False | False | False |
| ... | ... | ... | ... | ... |
| 691395 | False | False | False | False |
| 691396 | False | False | False | False |
| 691397 | False | False | False | False |
| 691398 | False | False | False | False |
| 691399 | False | False | False | False |
691400 rows × 4 columns
comments.isnull().sum()
video_id 0 comment_text 25 likes 0 replies 0 dtype: int64
comments.dropna(inplace=True)
comments.isnull().sum()
video_id 0 comment_text 0 likes 0 replies 0 dtype: int64
!pip install textblob
Requirement already satisfied: textblob in c:\users\benny\anaconda3\lib\site-packages (0.15.3) Requirement already satisfied: nltk>=3.1 in c:\users\benny\anaconda3\lib\site-packages (from textblob) (3.8.1) Requirement already satisfied: click in c:\users\benny\anaconda3\lib\site-packages (from nltk>=3.1->textblob) (8.0.4) Requirement already satisfied: joblib in c:\users\benny\anaconda3\lib\site-packages (from nltk>=3.1->textblob) (1.2.0) Requirement already satisfied: regex>=2021.8.3 in c:\users\benny\anaconda3\lib\site-packages (from nltk>=3.1->textblob) (2022.7.9) Requirement already satisfied: tqdm in c:\users\benny\anaconda3\lib\site-packages (from nltk>=3.1->textblob) (4.65.0) Requirement already satisfied: colorama in c:\users\benny\anaconda3\lib\site-packages (from click->nltk>=3.1->textblob) (0.4.6)
from textblob import TextBlob
comments.head(6)
| video_id | comment_text | likes | replies | |
|---|---|---|---|---|
| 0 | XpVt6Z1Gjjo | Logan Paul it's yo big day ‼️‼️‼️ | 4 | 0 |
| 1 | XpVt6Z1Gjjo | I've been following you from the start of your... | 3 | 0 |
| 2 | XpVt6Z1Gjjo | Say hi to Kong and maverick for me | 3 | 0 |
| 3 | XpVt6Z1Gjjo | MY FAN . attendance | 3 | 0 |
| 4 | XpVt6Z1Gjjo | trending 😉 | 3 | 0 |
| 5 | XpVt6Z1Gjjo | #1 on trending AYYEEEEE | 3 | 0 |
TextBlob("Logan Paul it's yo big day ‼️‼️‼️").sentiment.polarity
0.0
comments.shape
(691375, 4)
sample_df = comments[0:1000]
sample_df.shape
(1000, 4)
# Score every comment with TextBlob's sentiment polarity. A comment that
# TextBlob cannot process gets 0, so `polarity` keeps one entry per row of
# `comments`.
polarity = []
for comment in comments['comment_text']:
    try:
        polarity.append(TextBlob(comment).sentiment.polarity)
    except Exception:
        # Still best-effort, but no longer a bare `except:`, so
        # KeyboardInterrupt/SystemExit can stop this long-running loop.
        polarity.append(0)
len(polarity)
691375
comments['polarity'] = polarity
comments.head(5)
| video_id | comment_text | likes | replies | polarity | |
|---|---|---|---|---|---|
| 0 | XpVt6Z1Gjjo | Logan Paul it's yo big day ‼️‼️‼️ | 4 | 0 | 0.0 |
| 1 | XpVt6Z1Gjjo | I've been following you from the start of your... | 3 | 0 | 0.0 |
| 2 | XpVt6Z1Gjjo | Say hi to Kong and maverick for me | 3 | 0 | 0.0 |
| 3 | XpVt6Z1Gjjo | MY FAN . attendance | 3 | 0 | 0.0 |
| 4 | XpVt6Z1Gjjo | trending 😉 | 3 | 0 | 0.0 |
Graphical representation of text frequency
# Boolean masks for the two extreme polarity scores: exactly 1 and exactly -1.
filter1 = comments['polarity'].eq(1)
filter2 = comments['polarity'].eq(-1)

# Row subsets selected by those masks.
comments_positive = comments.loc[filter1]
comments_negative = comments.loc[filter2]
comments_positive.head(5)
| video_id | comment_text | likes | replies | polarity | |
|---|---|---|---|---|---|
| 64 | XpVt6Z1Gjjo | yu are the best | 1 | 0 | 1.0 |
| 156 | cLdxuaxaQwc | Power is the disease. Care is the cure. Keep... | 0 | 0 | 1.0 |
| 227 | WYYvHb03Eog | YAS Can't wait to get it! I just need to sell ... | 0 | 0 | 1.0 |
| 307 | sjlHnJvXdQs | This is priceless | 0 | 0 | 1.0 |
| 319 | sjlHnJvXdQs | Summed up perfectly | 0 | 0 | 1.0 |
!pip install wordcloud
Requirement already satisfied: wordcloud in c:\users\benny\anaconda3\lib\site-packages (1.9.2) Requirement already satisfied: numpy>=1.6.1 in c:\users\benny\anaconda3\lib\site-packages (from wordcloud) (1.24.3) Requirement already satisfied: pillow in c:\users\benny\anaconda3\lib\site-packages (from wordcloud) (9.4.0) Requirement already satisfied: matplotlib in c:\users\benny\anaconda3\lib\site-packages (from wordcloud) (3.7.1) Requirement already satisfied: contourpy>=1.0.1 in c:\users\benny\anaconda3\lib\site-packages (from matplotlib->wordcloud) (1.0.5) Requirement already satisfied: cycler>=0.10 in c:\users\benny\anaconda3\lib\site-packages (from matplotlib->wordcloud) (0.11.0) Requirement already satisfied: fonttools>=4.22.0 in c:\users\benny\anaconda3\lib\site-packages (from matplotlib->wordcloud) (4.25.0) Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\benny\anaconda3\lib\site-packages (from matplotlib->wordcloud) (1.4.4) Requirement already satisfied: packaging>=20.0 in c:\users\benny\anaconda3\lib\site-packages (from matplotlib->wordcloud) (23.0) Requirement already satisfied: pyparsing>=2.3.1 in c:\users\benny\anaconda3\lib\site-packages (from matplotlib->wordcloud) (3.0.9) Requirement already satisfied: python-dateutil>=2.7 in c:\users\benny\anaconda3\lib\site-packages (from matplotlib->wordcloud) (2.8.2) Requirement already satisfied: six>=1.5 in c:\users\benny\anaconda3\lib\site-packages (from python-dateutil>=2.7->matplotlib->wordcloud) (1.16.0)
from wordcloud import WordCloud , STOPWORDS
set(STOPWORDS)
{'a',
'about',
'above',
'after',
'again',
'against',
'all',
'also',
'am',
'an',
'and',
'any',
'are',
"aren't",
'as',
'at',
'be',
'because',
'been',
'before',
'being',
'below',
'between',
'both',
'but',
'by',
'can',
"can't",
'cannot',
'com',
'could',
"couldn't",
'did',
"didn't",
'do',
'does',
"doesn't",
'doing',
"don't",
'down',
'during',
'each',
'else',
'ever',
'few',
'for',
'from',
'further',
'get',
'had',
"hadn't",
'has',
"hasn't",
'have',
"haven't",
'having',
'he',
"he'd",
"he'll",
"he's",
'hence',
'her',
'here',
"here's",
'hers',
'herself',
'him',
'himself',
'his',
'how',
"how's",
'however',
'http',
'i',
"i'd",
"i'll",
"i'm",
"i've",
'if',
'in',
'into',
'is',
"isn't",
'it',
"it's",
'its',
'itself',
'just',
'k',
"let's",
'like',
'me',
'more',
'most',
"mustn't",
'my',
'myself',
'no',
'nor',
'not',
'of',
'off',
'on',
'once',
'only',
'or',
'other',
'otherwise',
'ought',
'our',
'ours',
'ourselves',
'out',
'over',
'own',
'r',
'same',
'shall',
"shan't",
'she',
"she'd",
"she'll",
"she's",
'should',
"shouldn't",
'since',
'so',
'some',
'such',
'than',
'that',
"that's",
'the',
'their',
'theirs',
'them',
'themselves',
'then',
'there',
"there's",
'therefore',
'these',
'they',
"they'd",
"they'll",
"they're",
"they've",
'this',
'those',
'through',
'to',
'too',
'under',
'until',
'up',
'very',
'was',
"wasn't",
'we',
"we'd",
"we'll",
"we're",
"we've",
'were',
"weren't",
'what',
"what's",
'when',
"when's",
'where',
"where's",
'which',
'while',
'who',
"who's",
'whom',
'why',
"why's",
'with',
"won't",
'would',
"wouldn't",
'www',
'you',
"you'd",
"you'll",
"you're",
"you've",
'your',
'yours',
'yourself',
'yourselves'}
comments['comment_text']
0 Logan Paul it's yo big day ‼️‼️‼️
1 I've been following you from the start of your...
2 Say hi to Kong and maverick for me
3 MY FAN . attendance
4 trending 😉
...
691395 Лучшая
691396 qu'est ce que j'aimerais que tu viennes à Roan...
691397 Ven a mexico! 😍 te amo LP
691398 Islığı yeter...
691399 Kocham tą piosenkę😍❤❤❤byłam zakochana po uszy ...
Name: comment_text, Length: 691375, dtype: object
type(comments['comment_text'])
pandas.core.series.Series
# Join the text of all polarity == 1 comments into one space-separated string.
total_comments_positive = ' '.join(comments_positive['comment_text'])
# Build the word cloud from that string, excluding the STOPWORDS entries.
wordcloud = WordCloud(stopwords=set(STOPWORDS)).generate(total_comments_positive)
# Show the generated cloud as an image with the axes hidden.
plt.imshow(wordcloud)
plt.axis('off')
(-0.5, 399.5, 199.5, -0.5)
# Join the text of all polarity == -1 comments into one space-separated string.
total_comments_negative = ' '.join(comments_negative['comment_text'])
# Build the word cloud from that string, excluding the STOPWORDS entries.
wordcloud2 = WordCloud(stopwords=set(STOPWORDS)).generate(total_comments_negative)
# Show the generated cloud as an image with the axes hidden.
plt.imshow(wordcloud2)
plt.axis('off')
(-0.5, 399.5, 199.5, -0.5)
!pip install emoji==2.2.0
Requirement already satisfied: emoji==2.2.0 in c:\users\benny\anaconda3\lib\site-packages (2.2.0)
import emoji
emoji.__version__
'2.2.0'
comments['comment_text'].head(6)
0 Logan Paul it's yo big day ‼️‼️‼️ 1 I've been following you from the start of your... 2 Say hi to Kong and maverick for me 3 MY FAN . attendance 4 trending 😉 5 #1 on trending AYYEEEEE Name: comment_text, dtype: object
comment = 'trending 😉'
[char for char in comment if char in emoji.EMOJI_DATA]
['😉']
# Every character of `comment` that is a key of emoji.EMOJI_DATA, in order.
emoji_list = [symbol for symbol in comment if symbol in emoji.EMOJI_DATA]
emoji_list
['😉']
# Flat list of every emoji character found across all non-null comments,
# in order of appearance.
all_emojis_list = []
for comment in comments['comment_text'].dropna():
    all_emojis_list.extend(
        symbol for symbol in comment if symbol in emoji.EMOJI_DATA
    )
all_emojis_list[0:10]
['‼', '‼', '‼', '😉', '😭', '👍', '🏻', '❤', '😍', '💋']
from collections import Counter
Counter(all_emojis_list).most_common(10)
[('😂', 36987),
('😍', 33453),
('❤', 31119),
('🔥', 8694),
('😭', 8398),
('👏', 5719),
('😘', 5545),
('👍', 5476),
('💖', 5359),
('💕', 5147)]
Counter(all_emojis_list).most_common(10)[0][0]
'😂'
Counter(all_emojis_list).most_common(10)[1][0]
'😍'
Counter(all_emojis_list).most_common(10)[2][0]
'❤'
# Symbols of the (up to) ten most frequent emojis, most frequent first.
# Counting once and unpacking the (symbol, count) pairs avoids rebuilding the
# Counter for each index, and does not raise IndexError when fewer than ten
# distinct emojis are present.
emojis = [symbol for symbol, _ in Counter(all_emojis_list).most_common(10)]
Counter(all_emojis_list).most_common(10)[0][1]
36987
Counter(all_emojis_list).most_common(10)[1][1]
33453
Counter(all_emojis_list).most_common(10)[2][1]
31119
# Occurrence counts of the (up to) ten most frequent emojis, highest first.
# Counting once and unpacking the (symbol, count) pairs avoids rebuilding the
# Counter for each index, and does not raise IndexError when fewer than ten
# distinct emojis are present.
freqs = [count for _, count in Counter(all_emojis_list).most_common(10)]
freqs
[36987, 33453, 31119, 8694, 8398, 5719, 5545, 5476, 5359, 5147]
import plotly.graph_objs as go
from plotly.offline import iplot
# Bar chart with the top emoji symbols on x and their counts on y.
trace = go.Bar(x=emojis , y=freqs)
iplot([trace])
import os
files = os.listdir(r'C:\DA_BA_material\additional_data')
files
['CAvideos.csv', 'CA_category_id.json', 'DEvideos.csv', 'DE_category_id.json', 'FRvideos.csv', 'FR_category_id.json', 'GBvideos.csv', 'GB_category_id.json', 'INvideos.csv', 'IN_category_id.json', 'JPvideos.csv', 'JP_category_id.json', 'KRvideos.csv', 'KR_category_id.json', 'MXvideos.csv', 'MX_category_id.json', 'RUvideos.csv', 'RU_category_id.json', 'USvideos.csv', 'US_category_id.json']
# Keep only the CSV files. Testing the suffix (rather than a substring)
# avoids picking up names that merely contain '.csv', e.g. 'x.csv.bak'.
files_csv = [file for file in files if file.endswith('.csv')]
files_csv
['CAvideos.csv', 'DEvideos.csv', 'FRvideos.csv', 'GBvideos.csv', 'INvideos.csv', 'JPvideos.csv', 'KRvideos.csv', 'MXvideos.csv', 'RUvideos.csv', 'USvideos.csv']
import warnings
from warnings import filterwarnings
filterwarnings('ignore')
# Read every per-country CSV and stack the results into one frame.
# Frames are collected first and concatenated once: calling pd.concat inside
# the loop re-copies all previously read rows on each iteration.
# NOTE(review): titles in the output look mis-decoded (e.g. 'BeyoncÃ©'),
# which suggests some files may really be UTF-8; confirm before changing
# `encoding`.
path = r'C:\DA_BA_material\additional_data'
frames = [
    pd.read_csv(
        os.path.join(path, file),
        encoding='iso-8859-1',
        # Replaces the deprecated `error_bad_lines=False`: malformed rows are
        # skipped with a warning.
        on_bad_lines='warn',
    )
    for file in files_csv
]
# pd.concat raises on an empty list, so fall back to an empty frame, which is
# what the original accumulator produced when there were no files.
full_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
full_df.shape
(375942, 16)
full_df[full_df.duplicated()].shape
(36417, 16)
full_df = full_df.drop_duplicates()
full_df.shape
(339525, 16)
# Write the first 1000 rows to disk twice: as CSV (without the index column)
# and as JSON.
first_rows = full_df[0:1000]
first_rows.to_csv(r'C:\DA_BA_material/youtube_sample.csv', index=False)
first_rows.to_json(r'C:\DA_BA_material/youtube_sample.json')
from sqlalchemy import create_engine
# Raw string: in a normal literal '\D' is an invalid escape sequence, which
# newer Python versions warn about. The resulting URL text is unchanged.
engine = create_engine(r'sqlite:///C:\DA_BA_material/youtube_sample.sqlite')
# Store the first 1000 rows in the 'Users' table of that SQLite database.
# NOTE(review): if_exists='append' adds the rows again on every re-run of
# this cell; confirm whether 'replace' was intended.
full_df[0:1000].to_sql('Users', con=engine, if_exists='append')
1000
full_df.head(5)
| video_id | trending_date | title | channel_title | category_id | publish_time | tags | views | likes | dislikes | comment_count | thumbnail_link | comments_disabled | ratings_disabled | video_error_or_removed | description | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | n1WpP7iowLc | 17.14.11 | Eminem - Walk On Water (Audio) ft. Beyoncé | EminemVEVO | 10 | 2017-11-10T17:00:03.000Z | Eminem|"Walk"|"On"|"Water"|"Aftermath/Shady/In... | 17158579 | 787425 | 43420 | 125882 | https://i.ytimg.com/vi/n1WpP7iowLc/default.jpg | False | False | False | Eminem's new track Walk on Water ft. Beyoncé ... |
| 1 | 0dBIkQ4Mz1M | 17.14.11 | PLUSH - Bad Unboxing Fan Mail | iDubbbzTV | 23 | 2017-11-13T17:00:00.000Z | plush|"bad unboxing"|"unboxing"|"fan mail"|"id... | 1014651 | 127794 | 1688 | 13030 | https://i.ytimg.com/vi/0dBIkQ4Mz1M/default.jpg | False | False | False | STill got a lot of packages. Probably will las... |
| 2 | 5qpjK5DgCt4 | 17.14.11 | Racist Superman | Rudy Mancuso, King Bach & Le... | Rudy Mancuso | 23 | 2017-11-12T19:05:24.000Z | racist superman|"rudy"|"mancuso"|"king"|"bach"... | 3191434 | 146035 | 5339 | 8181 | https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg | False | False | False | WATCH MY PREVIOUS VIDEO ⶠ\n\nSUBSCRIBE ⺠... |
| 3 | d380meD0W0M | 17.14.11 | I Dare You: GOING BALD!? | nigahiga | 24 | 2017-11-12T18:01:41.000Z | ryan|"higa"|"higatv"|"nigahiga"|"i dare you"|"... | 2095828 | 132239 | 1989 | 17518 | https://i.ytimg.com/vi/d380meD0W0M/default.jpg | False | False | False | I know it's been a while since we did this sho... |
| 4 | 2Vv-BfVoq4g | 17.14.11 | Ed Sheeran - Perfect (Official Music Video) | Ed Sheeran | 10 | 2017-11-09T11:04:14.000Z | edsheeran|"ed sheeran"|"acoustic"|"live"|"cove... | 33523622 | 1634130 | 21082 | 85067 | https://i.ytimg.com/vi/2Vv-BfVoq4g/default.jpg | False | False | False | ð§: https://ad.gt/yt-perfect\nð°: https://... |
full_df['category_id'].unique()
array([10, 23, 24, 25, 22, 26, 1, 28, 20, 17, 29, 15, 19, 2, 27, 43, 30,
44], dtype=int64)
json_df = pd.read_json(r'C:\DA_BA_material\additional_data/US_category_id.json')
json_df
| kind | etag | items | |
|---|---|---|---|
| 0 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 1 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 2 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 3 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 4 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 5 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 6 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 7 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 8 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 9 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 10 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 11 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 12 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 13 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 14 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 15 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 16 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 17 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 18 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 19 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 20 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 21 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 22 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 23 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 24 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 25 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 26 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 27 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 28 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 29 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 30 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 31 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
json_df['items'][0]
{'kind': 'youtube#videoCategory',
'etag': '"m2yskBQFythfE4irbTIeOgYYfBU/Xy1mB4_yLrHy_BmKmPBggty2mZQ"',
'id': '1',
'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ',
'title': 'Film & Animation',
'assignable': True}}
json_df['items'][1]
{'kind': 'youtube#videoCategory',
'etag': '"m2yskBQFythfE4irbTIeOgYYfBU/UZ1oLIIz2dxIhO45ZTFR3a3NyTA"',
'id': '2',
'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ',
'title': 'Autos & Vehicles',
'assignable': True}}
# Lookup table from integer category id to category title, built from the
# dicts stored in the 'items' column of json_df.
cat_dict = {
    int(entry['id']): entry['snippet']['title']
    for entry in json_df['items'].values
}
cat_dict
{1: 'Film & Animation',
2: 'Autos & Vehicles',
10: 'Music',
15: 'Pets & Animals',
17: 'Sports',
18: 'Short Movies',
19: 'Travel & Events',
20: 'Gaming',
21: 'Videoblogging',
22: 'People & Blogs',
23: 'Comedy',
24: 'Entertainment',
25: 'News & Politics',
26: 'Howto & Style',
27: 'Education',
28: 'Science & Technology',
29: 'Nonprofits & Activism',
30: 'Movies',
31: 'Anime/Animation',
32: 'Action/Adventure',
33: 'Classics',
34: 'Comedy',
35: 'Documentary',
36: 'Drama',
37: 'Family',
38: 'Foreign',
39: 'Horror',
40: 'Sci-Fi/Fantasy',
41: 'Thriller',
42: 'Shorts',
43: 'Shows',
44: 'Trailers'}
full_df['category_name'] = full_df['category_id'].map(cat_dict)
full_df.head(4)
| video_id | trending_date | title | channel_title | category_id | publish_time | tags | views | likes | dislikes | comment_count | thumbnail_link | comments_disabled | ratings_disabled | video_error_or_removed | description | category_name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | n1WpP7iowLc | 17.14.11 | Eminem - Walk On Water (Audio) ft. Beyoncé | EminemVEVO | 10 | 2017-11-10T17:00:03.000Z | Eminem|"Walk"|"On"|"Water"|"Aftermath/Shady/In... | 17158579 | 787425 | 43420 | 125882 | https://i.ytimg.com/vi/n1WpP7iowLc/default.jpg | False | False | False | Eminem's new track Walk on Water ft. Beyoncé ... | Music |
| 1 | 0dBIkQ4Mz1M | 17.14.11 | PLUSH - Bad Unboxing Fan Mail | iDubbbzTV | 23 | 2017-11-13T17:00:00.000Z | plush|"bad unboxing"|"unboxing"|"fan mail"|"id... | 1014651 | 127794 | 1688 | 13030 | https://i.ytimg.com/vi/0dBIkQ4Mz1M/default.jpg | False | False | False | STill got a lot of packages. Probably will las... | Comedy |
| 2 | 5qpjK5DgCt4 | 17.14.11 | Racist Superman | Rudy Mancuso, King Bach & Le... | Rudy Mancuso | 23 | 2017-11-12T19:05:24.000Z | racist superman|"rudy"|"mancuso"|"king"|"bach"... | 3191434 | 146035 | 5339 | 8181 | https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg | False | False | False | WATCH MY PREVIOUS VIDEO ⶠ\n\nSUBSCRIBE ⺠... | Comedy |
| 3 | d380meD0W0M | 17.14.11 | I Dare You: GOING BALD!? | nigahiga | 24 | 2017-11-12T18:01:41.000Z | ryan|"higa"|"higatv"|"nigahiga"|"i dare you"|"... | 2095828 | 132239 | 1989 | 17518 | https://i.ytimg.com/vi/d380meD0W0M/default.jpg | False | False | False | I know it's been a while since we did this sho... | Entertainment |
# Box plot of likes for each category name, on a 12x8 figure.
plt.figure(figsize=(12,8))
sns.boxplot(x='category_name' , y='likes' , data=full_df)
# Draw the category labels vertically.
plt.xticks(rotation='vertical')
(array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17]),
[Text(0, 0, 'Music'),
Text(1, 0, 'Comedy'),
Text(2, 0, 'Entertainment'),
Text(3, 0, 'News & Politics'),
Text(4, 0, 'People & Blogs'),
Text(5, 0, 'Howto & Style'),
Text(6, 0, 'Film & Animation'),
Text(7, 0, 'Science & Technology'),
Text(8, 0, 'Gaming'),
Text(9, 0, 'Sports'),
Text(10, 0, 'Nonprofits & Activism'),
Text(11, 0, 'Pets & Animals'),
Text(12, 0, 'Travel & Events'),
Text(13, 0, 'Autos & Vehicles'),
Text(14, 0, 'Education'),
Text(15, 0, 'Shows'),
Text(16, 0, 'Movies'),
Text(17, 0, 'Trailers')])
# Add three columns expressing likes, dislikes and comment counts as a
# percentage of views.
for source_col, rate_col in (
    ('likes', 'like_rate'),
    ('dislikes', 'dislike_rate'),
    ('comment_count', 'comment_count_rate'),
):
    full_df[rate_col] = full_df[source_col].div(full_df['views']).mul(100)
full_df.columns
Index(['video_id', 'trending_date', 'title', 'channel_title', 'category_id',
'publish_time', 'tags', 'views', 'likes', 'dislikes', 'comment_count',
'thumbnail_link', 'comments_disabled', 'ratings_disabled',
'video_error_or_removed', 'description', 'category_name', 'like_rate',
'dislike_rate', 'comment_count_rate'],
dtype='object')
# Box plot of like_rate (likes as a percentage of views) for each category
# name, on an 8x6 figure.
plt.figure(figsize=(8,6))
sns.boxplot(x='category_name' , y='like_rate' , data=full_df)
# Draw the category labels vertically.
plt.xticks(rotation='vertical')
plt.show()
sns.regplot(x='views' , y='likes' , data = full_df)
<Axes: xlabel='views', ylabel='likes'>
full_df.columns
Index(['video_id', 'trending_date', 'title', 'channel_title', 'category_id',
'publish_time', 'tags', 'views', 'likes', 'dislikes', 'comment_count',
'thumbnail_link', 'comments_disabled', 'ratings_disabled',
'video_error_or_removed', 'description', 'category_name', 'like_rate',
'dislike_rate', 'comment_count_rate'],
dtype='object')
full_df[['views', 'likes', 'dislikes']].corr()
| views | likes | dislikes | |
|---|---|---|---|
| views | 1.000000 | 0.779531 | 0.405428 |
| likes | 0.779531 | 1.000000 | 0.451809 |
| dislikes | 0.405428 | 0.451809 | 1.000000 |
sns.heatmap(full_df[['views', 'likes', 'dislikes']].corr() , annot=True)
<Axes: >
full_df.head(6)
| video_id | trending_date | title | channel_title | category_id | publish_time | tags | views | likes | dislikes | comment_count | thumbnail_link | comments_disabled | ratings_disabled | video_error_or_removed | description | category_name | like_rate | dislike_rate | comment_count_rate | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | n1WpP7iowLc | 17.14.11 | Eminem - Walk On Water (Audio) ft. Beyoncé | EminemVEVO | 10 | 2017-11-10T17:00:03.000Z | Eminem|"Walk"|"On"|"Water"|"Aftermath/Shady/In... | 17158579 | 787425 | 43420 | 125882 | https://i.ytimg.com/vi/n1WpP7iowLc/default.jpg | False | False | False | Eminem's new track Walk on Water ft. Beyoncé ... | Music | 4.589104 | 0.253051 | 0.733639 |
| 1 | 0dBIkQ4Mz1M | 17.14.11 | PLUSH - Bad Unboxing Fan Mail | iDubbbzTV | 23 | 2017-11-13T17:00:00.000Z | plush|"bad unboxing"|"unboxing"|"fan mail"|"id... | 1014651 | 127794 | 1688 | 13030 | https://i.ytimg.com/vi/0dBIkQ4Mz1M/default.jpg | False | False | False | STill got a lot of packages. Probably will las... | Comedy | 12.594873 | 0.166363 | 1.284185 |
| 2 | 5qpjK5DgCt4 | 17.14.11 | Racist Superman | Rudy Mancuso, King Bach & Le... | Rudy Mancuso | 23 | 2017-11-12T19:05:24.000Z | racist superman|"rudy"|"mancuso"|"king"|"bach"... | 3191434 | 146035 | 5339 | 8181 | https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg | False | False | False | WATCH MY PREVIOUS VIDEO ⶠ\n\nSUBSCRIBE ⺠... | Comedy | 4.575843 | 0.167292 | 0.256342 |
| 3 | d380meD0W0M | 17.14.11 | I Dare You: GOING BALD!? | nigahiga | 24 | 2017-11-12T18:01:41.000Z | ryan|"higa"|"higatv"|"nigahiga"|"i dare you"|"... | 2095828 | 132239 | 1989 | 17518 | https://i.ytimg.com/vi/d380meD0W0M/default.jpg | False | False | False | I know it's been a while since we did this sho... | Entertainment | 6.309630 | 0.094903 | 0.835851 |
| 4 | 2Vv-BfVoq4g | 17.14.11 | Ed Sheeran - Perfect (Official Music Video) | Ed Sheeran | 10 | 2017-11-09T11:04:14.000Z | edsheeran|"ed sheeran"|"acoustic"|"live"|"cove... | 33523622 | 1634130 | 21082 | 85067 | https://i.ytimg.com/vi/2Vv-BfVoq4g/default.jpg | False | False | False | ð§: https://ad.gt/yt-perfect\nð°: https://... | Music | 4.874563 | 0.062887 | 0.253752 |
| 5 | 0yIWz1XEeyc | 17.14.11 | Jake Paul Says Alissa Violet CHEATED with LOGA... | DramaAlert | 25 | 2017-11-13T07:37:51.000Z | #DramaAlert|"Drama"|"Alert"|"DramaAlert"|"keem... | 1309699 | 103755 | 4613 | 12143 | https://i.ytimg.com/vi/0yIWz1XEeyc/default.jpg | False | False | False | ⺠Follow for News! - https://twitter.com/KEE... | News & Politics | 7.922049 | 0.352218 | 0.927160 |
full_df['channel_title'].value_counts()
The Late Show with Stephen Colbert 710
WWE 643
Late Night with Seth Meyers 592
TheEllenShow 555
Jimmy Kimmel Live 528
...
Daas 1
YT Industries 1
BTLV Le média complémentaire 1
Quem Sabia ? 1
Jessi Osorno 1
Name: channel_title, Length: 37824, dtype: int64
# Number of rows per channel, largest first, as a two-column frame
# (channel_title, total_videos).
rows_per_channel = full_df.groupby(['channel_title']).size()
cdf = rows_per_channel.sort_values(ascending=False).reset_index(name='total_videos')
cdf
| channel_title | total_videos | |
|---|---|---|
| 0 | The Late Show with Stephen Colbert | 710 |
| 1 | WWE | 643 |
| 2 | Late Night with Seth Meyers | 592 |
| 3 | TheEllenShow | 555 |
| 4 | Jimmy Kimmel Live | 528 |
| ... | ... | ... |
| 37819 | Kd Malts | 1 |
| 37820 | Zedan TV | 1 |
| 37821 | Kc Kelly - Rocketprenuer | 1 |
| 37822 | Kbaby | 1 |
| 37823 | Pavel Sidorik TV | 1 |
37824 rows × 2 columns
import plotly.express as px
px.bar(data_frame=cdf[0:20] , x='channel_title' , y='total_videos')
full_df['title'][0]
'Eminem - Walk On Water (Audio) ft. Beyoncé'
import string
string.punctuation
'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
len([char for char in full_df['title'][0] if char in string.punctuation])
4
def punc_count(text):
    """Return how many characters of ``text`` are in ``string.punctuation``.

    Args:
        text: The value to scan, normally a string. A value that cannot be
            iterated (for example ``None`` or the float NaN that stands in
            for a missing title) is treated as containing no punctuation.

    Returns:
        int: The number of punctuation characters; 0 for empty or
        non-iterable input.
    """
    try:
        chars = iter(text)
    except TypeError:
        # Missing values would otherwise abort a whole `.apply(punc_count)`.
        return 0
    # Summing over a generator avoids building a list just to take its length.
    return sum(1 for char in chars if char in string.punctuation)
# Take an independent copy of the first 10000 rows, so that adding a column
# below is not a chained assignment onto a slice of full_df.
sample = full_df[0:10000].copy()
# Number of punctuation characters in each title.
sample['count_punc'] = sample['title'].apply(punc_count)
sample['count_punc']
0 4
1 1
2 3
3 3
4 3
..
9995 6
9996 0
9997 1
9998 0
9999 6
Name: count_punc, Length: 10000, dtype: int64
# Box plot of likes grouped by the number of punctuation characters in the
# title, drawn from the 10000-row sample on an 8x6 figure.
plt.figure(figsize=(8,6))
sns.boxplot(x='count_punc' , y='likes' , data=sample)
plt.show()